Comparing numerical data across groups

Setup

# Standard library
import warnings

# Third-party
import altair as alt
import pandas as pd

# Silence FutureWarning messages so they do not clutter the notebook output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Turn off Altair's maximum-rows check for the data passed to charts.
# When run as the last statement of a notebook cell, this call echoes:
#   DataTransformerRegistry.enable('default')
# That echoed text is output, not code, so it must not appear as a statement
# (the name DataTransformerRegistry is never imported here).
alt.data_transformers.disable_max_rows()

Data

Import data

# Base URL and file name of the CSV data set; they are concatenated below.
ROOT = "https://raw.githubusercontent.com/kirenz/datasets/master/"
DATA = "county.csv"

# Read the CSV at ROOT + DATA into a DataFrame.
df = pd.read_csv(ROOT + DATA)
# Select only relevant variables

# Names of the columns to keep.
data_selection = ["state", "name", "pop_change", 
                  "population_change", "median_hh_income", "metro"]
                  
# Exercise placeholder: replace ___ with an expression that restricts df to
# the columns listed in data_selection.
df = ___

Data corrections

# drop missing values
# (inplace=True modifies df itself instead of returning a new DataFrame)
df.dropna(inplace=True)
# rename variable population_change to change (use: inplace=True)
# Exercise placeholder: fill in the old column name, the new column name and
# the keyword argument named in the hint above.
df.rename(columns={'___': '___'}, ___=___)
# change data type to category
# Exercise placeholder: complete the right-hand side so that the 'change'
# column is converted to a categorical type. This line is not valid Python
# until the ___ has been replaced.
df['change'] = df['change']___

Analysis

# count the values
# Exercise placeholder: replace ___ with the Series method call that tallies
# how often each value of 'change' occurs. The expected result is the
# output printed directly below.
df['change'].___
no gain    1285
gain       1275
Name: change, dtype: int64
# Exercise placeholder: replace ___ with the Series method call that tallies
# how often each value of 'metro' occurs. The expected result is the
# output printed directly below.
df['metro'].___
no     1615
yes     945
Name: metro, dtype: int64

Histogram for two groups

# use median_hh_income and change as color
# Exercise placeholder: every ___ below must be filled in to build an Altair
# chart. The first encoding receives the variable to be binned (maxbins caps
# the number of bins), and the color encoding separates the two groups.
# NOTE(review): the blank layout appears to match the
# alt.Chart(df).mark_bar().encode(...) histograms in the "Faceting" section;
# confirm against the exercise solution.

___.___(___).___().___(
    ___=___.___("___", 
            bin=alt.BinParams(maxbins=___)),
    ___=___.___('___'),
    color=___.___('___')
)

Side-by-side box plot

# Box plots of median household income (x) for each 'change' group (y),
# with the boxes colored by group.
boxplot_channels = dict(
    x=alt.X('median_hh_income'),
    y=alt.Y('change'),
    color=alt.Color('change'),
)
boxplot_size = dict(width=400, height=150)

# The bare expression is the cell's last statement, so the notebook renders it.
alt.Chart(df).mark_boxplot().encode(**boxplot_channels).properties(**boxplot_size)

Faceting

# Grid of income histograms created by passing the facet channels
# (Column and Row) straight to encode().
income_bins = alt.X("median_hh_income", bin=alt.BinParams(maxbins=50))
facet_grid_channels = [
    income_bins,
    alt.Y('count()'),
    alt.Column('metro'),  # <-- one column of panels per 'metro' value
    alt.Row('change'),  # <-- one row of panels per 'change' value
]
panel_size = {"width": 200, "height": 100}

# Channels are passed positionally, exactly as in a direct encode(...) call.
alt.Chart(df).mark_bar().encode(*facet_grid_channels).properties(**panel_size)
# Grid of income histograms created by defining a single panel first and
# then splitting it with the chart's facet() method.
single_panel = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X("median_hh_income", bin=alt.BinParams(maxbins=50)),
        y=alt.Y('count()'),
    )
    .properties(width=200, height=100)
)

# <-- facet the finished panel: columns by 'metro', rows by 'change'
single_panel.facet(column='metro', row='change')

Pair plots

# Scatter-plot matrix: one circle-mark panel is defined with repeated x/y
# fields, then repeated over the row and column variable lists below.
repeat_rows = ['pop_change', 'median_hh_income']
repeat_columns = ['median_hh_income', 'pop_change']

scatter_cell = alt.Chart(df).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
    color=alt.Color('change'),
).properties(width=150, height=150)

# interactive() is applied to the repeated chart, as the final step.
scatter_cell.repeat(row=repeat_rows, column=repeat_columns).interactive()